Pancreas TRAs
# ----------------------------------------------------------
# TRAs
# ----------------------------------------------------------
# set working directory to folder with TRA tables (rawdata/TRA Daten)
projectPath <- dirname(rstudioapi::getSourceEditorContext()$path)
setwd(paste(projectPath, "rawdata", "TRA Daten", sep = "/"))
# read in all 6 tables
TRAs1_human <-read_csv("Human_protein_atlas_TRA_5median_genes_annotated.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Ensembl_human_genes = col_character(),
## Chromosome = col_character(),
## Startsite = col_double(),
## Strand = col_double(),
## Band = col_character(),
## Symbol = col_character(),
## EntrezGene = col_double(),
## Unigene = col_character(),
## Tissue_number = col_double(),
## Tissues = col_character(),
## Max_tissue = col_character()
## )
TRAs2_human <-read_tsv("tra.2014.human.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs3_human <-read_tsv("tra.2014.human.roth.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs4_mouse <-read_tsv("tra.2014.mouse.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs5_mouse <-read_tsv("tra.2014.mouse.4301.5x.table.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## ensembl.transcript = col_character(),
## ensembl.gene = col_character(),
## gene.symbol = col_character(),
## entrezID = col_character(),
## refseqID = col_character(),
## unigeneID = col_character(),
## chrom = col_character(),
## startsite = col_double(),
## tiss.number = col_double(),
## tissues = col_character(),
## max.tissue = col_character()
## )
TRAs6_human <-read_tsv("tra.2017.human.gtex.5x.table.tsv",col_types = cols(ensembl.chrom=col_character()))
#col_types to correct the parsing failure
PancreasTRAs1_human <- TRAs1_human[grep(x=TRAs1_human$Max_tissue,"panc"),]
PancreasTRAs2_human <- TRAs2_human[grep(x=TRAs2_human$max.tissue,"Panc"),]
PancreasTRAs3_human <- TRAs3_human[grep(x=TRAs3_human$max.tissue,"Panc"),]
PancreasTRAs4_mouse <- TRAs4_mouse[grep(x=TRAs4_mouse$max.tissue,"panc"),]
PancreasTRAs5_mouse <- TRAs5_mouse[grep(x=TRAs5_mouse$max.tissue,"panc"),]
PancreasTRAs6_human <- TRAs6_human[grep(x=TRAs6_human$max.tissue,"Panc"),]
# create character vectors with pancreas specific genes (for both humans & mice)
pancreas_gene_human <- unique(c(PancreasTRAs1_human$Symbol,PancreasTRAs2_human$gene.symbol,PancreasTRAs6_human$ensembl.symbol))
TRA_pancreas_genes_human <- pancreas_gene_human
# all TRAs
TRA_genes_human <- unique(c(TRAs1_human$Symbol,TRAs2_human$gene.symbol,TRAs6_human$ensembl.symbol))
Creating expression matrix
# ----------------------------------------------------------
# Create vsnrma normalized expression matrix
# ----------------------------------------------------------
set.seed(234) # with this vsnrma gives out the same values each time
# get CEL files
setwd(paste(projectPath, "rawdata", "rawdata breast GSE27830", sep = "/"))
data_breast <- ReadAffy()
# change cdf
data_breast@cdfName <- "HGU133Plus2_Hs_ENST"
# create expression matrix
breast_matrix <- exprs(data_breast)
# store colnames (microarray)
breast_microarray_information <- colnames(breast_matrix)
# vsnrma normalization
breast_vsnrma <- vsnrma(data_breast)
## Calculating Expression
# expression matrix of vsnrm normalized data set
breast_vsnrma_matrix <- exprs(breast_vsnrma)
# cut rownames at "."
rownames(breast_vsnrma_matrix) <- str_replace(rownames(breast_vsnrma_matrix),"\\..*","")
# remove ending .CEL from colnames
colnames(breast_vsnrma_matrix) <- str_remove(colnames(breast_vsnrma_matrix),"_.CEL")
# remove control probes (AFFX)
breast_vsnrma_matrix <- breast_vsnrma_matrix[which(!startsWith(rownames(breast_vsnrma_matrix), "AFFX")),]
# transcript IDs
breast_transcript_names <- rownames(breast_vsnrma_matrix)
# ----------------------------------------------------------
# Create data frames which hold information about microarrays
# ----------------------------------------------------------
# create data frame with information about microarray samples
breast_microarrays <- data.frame(number = substr(breast_microarray_information, 1,9),
row.names = c(1:10))
# replace old IDs sample names
colnames(breast_matrix) <- breast_microarrays[,"number"]
colnames(breast_vsnrma_matrix) <- breast_microarrays[,"number"]
# ----------------------------------------------------------
# Convert transcript IDs to gene symbols
# ----------------------------------------------------------
# get table for converting transcript IDs to gene symbols
setwd(paste(projectPath, "rawdata", "tables" ,sep = "/"))
annotation <- read.csv("ensemble.103.txt")
ensemble_transcripts <- annotation[,"Transcript.stable.ID"]
ensemble_symbol <- annotation[,"HGNC.symbol"]
# name each gene symbol by their respective transcripts
names(ensemble_symbol) <- ensemble_transcripts
# select only those transcripts that are also present in the annotation
breast_GeneExprs <- breast_vsnrma_matrix[rownames(breast_vsnrma_matrix) %in% ensemble_transcripts,]
# create vector with the symbols of the transcript from expression matrix
breast_symbol <- ensemble_symbol[rownames(breast_GeneExprs)]
# use gene symbols instead of transcripts as rownames
rownames(breast_GeneExprs) <- as.character(breast_symbol)
# order genes alphabetically & remove all those rows where "" is the rowname
breast_GeneExprs <- breast_GeneExprs[order(rownames(breast_GeneExprs)),][1097:nrow(breast_GeneExprs),]
# replace colnames with the GSM number of the microarrays
colnames(breast_GeneExprs) <- breast_microarrays[,"number"]
# ----------------------------------------------------------
# Breast cancer specific TRAs
# ----------------------------------------------------------
# create a expression matrix with only those genes
breast_GeneExprs_sub <- breast_GeneExprs[which(rownames(breast_GeneExprs) %in% TRA_pancreas_genes_human),]
# ----------------------------------------------------------
# TRAs for all tissues
# ----------------------------------------------------------
breast_GeneExprs_allTRA <- breast_GeneExprs[which(rownames(breast_GeneExprs) %in% TRA_genes_human),]
# ----------------------------------------------------------
# combining transcripts for same gene via median
# ----------------------------------------------------------
combineGeneExprs_median2 <- function(exprsmatrix){
# create data frame with a column of all the gene symbols
data <- data.frame(geneSymbols = rownames(exprsmatrix),
exprsmatrix,
row.names = NULL)
# group data frame by gene symbols and combine values of one gene by median
combined <- aggregate(data[-1], by = list(data$geneSymbols) , FUN = median)
# convert data frame to matrix (only expression values, no row names)
result <- as.matrix(combined[,-1])
# use gene symbols as row names
rownames(result) <- combined[,1]
# output
result
}
breast_GeneExprs_combined <- combineGeneExprs_median2(breast_GeneExprs)
breast_GeneExprs_sub_combined <- combineGeneExprs_median2(breast_GeneExprs_sub)
Quality control
# ----------------------------------------------------------
# Single chip control
# ----------------------------------------------------------
#setwd(paste(projectPath, "plots", sep = "/"))
for (i in 1:10){
image(data_breast[,i], col = rainbow (100, start = 0, end = 0.75)[100:1])
file.name = paste(as.character(breast_microarrays[, "number"])[i],".pdf", sep = "")
#dev.copy2pdf(file = file.name)
}










## The chips do not appear to be faulty.
# ----------------------------------------------------------
# Pheno Data
# ----------------------------------------------------------
pData(data_breast)
## sample
## GSM687017.CEL 1
## GSM687018.CEL 2
## GSM687019.CEL 3
## GSM687020.CEL 4
## GSM687021.CEL 5
## GSM687022.CEL 6
## GSM687023.CEL 7
## GSM687024.CEL 8
## GSM687025.CEL 9
## GSM687026.CEL 10
# ----------------------------------------------------------
# MeanSdPlot
# ----------------------------------------------------------
meanSdPlot(breast_vsnrma, plot = FALSE)$gg + theme(aspect.ratio = 1)

#setwd(paste(projectPath, "plots", sep = "/"))
#dev.copy2pdf(file = "breast_meanSdPlot_vsnrma_normalized.pdf")
## Red line is mostly horizontal. Therefore, mean - standard deviation have no dependence. However, there is a slight upwards trend of the standard deviation.
# ----------------------------------------------------------
# RNA degradation plot: breast data set
# ----------------------------------------------------------
RNAdeg_breast <- AffyRNAdeg(data_breast)
# Shift and scale
par(pty = "s", mai = c(1.1,0.5,0.5,0.1))
plotAffyRNAdeg(RNAdeg_breast, col = rainbow(10))
title(sub = "Breast Cancer Rawdata")

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_rnadeg_rawdata.pdf")
# Shift
par(pty = "s", mai = c(1.1,0.5,0.5,0.1))
plotAffyRNAdeg(RNAdeg_breast, col = rainbow(10), transform = "shift.only")
title(sub = "Breast Cancer Scaled")

#dev.copy2pdf(file = "breast_rnadeg_shift_rawdata.pdf")
## No crossing of lines was detected. The quality of the chips is good.
# ----------------------------------------------------------
# Histogram before and after normalization
# ----------------------------------------------------------
## Before normalization
par(mai = c(0.9,0.9,0.9,0.5))
hist(data_breast,
col = rainbow(10),
main = "Density function of log intensity (data set: GSE27830, Foekens et al.)\nbefore normalization")

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_hist_rawdata.pdf")
## After normalization
par(mai = c(0.9,0.9,0.9,0.5))
plot(density(breast_vsnrma_matrix),
type = "n", xlab = "log Intensity", ylim = c(0, 0.6),
main = "Density function of log intensity (data set: GSE27830, Foekens et al.)\nafter normalization")
for (i in 1:ncol(breast_vsnrma_matrix)){
lines(density(breast_vsnrma_matrix[,i]), col = rainbow(10)[i])
}

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_hist_vsnrma_normalized.pdf")
# ----------------------------------------------------------
# Boxplot before and after normalization
# ----------------------------------------------------------
## Before normalization
par(las = 2, mai = c(1, 0.8, 0.7, 0.1))
boxplot(data_breast, names = breast_microarrays[, "number"],
col = rainbow(10), ylab = "Intensity values",
main = "Gene expression in breast cancer before vsnrma normalization\ndata set: GSE27830 (Foekens et al.)",
cex.axis=0.8)

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_boxplot_rawdata.pdf")
## After normalization
par(las = 2, mai = c(1, 0.8, 0.7, 0.1))
boxplot(breast_vsnrma_matrix, names = breast_microarrays[, "number"],
col = rainbow(10), ylab = "Intensity values",
main = "Gene expression in breast cancer after vsnrma normalization\ndata set: GSE27830 (Foekens et al.)",
cex.axis=0.8)

#setwd(paste(projectPath, "plots" ,sep = "/"))
#dev.copy2pdf(file = "breast_boxplot_vsnrma_normalized.pdf")
# ----------------------------------------------------------
# Scatterplots
# ----------------------------------------------------------
## We plotted all scatterplots and found no abnormal distribution. Therefore, we decided against showing the numerous scatterplots. However, if there is need to review them, below you can see the code we used.
#setwd(paste(projectPath, "plots" ,sep = "/"))
#par(mai = c(0.9, 0.9, 0.7, 0.3))
#for (i in 1:9) {
#for (j in (i+1):10){
#plot(breast_vsnrma_matrix[,i], breast_vsnrma_matrix[,j], pch = ".",
#xlab = breast_microarrays[, "number"][i],
#ylab = breast_microarrays[, "number"][j])
#abline(0, 1, col = "red")
#title(main = paste("Scatterplot of probe",
#breast_microarrays[, "number"] [i],
#"and",
#breast_microarrays[, "number"] [i+1],
#sep = " ", collapse = NULL))
#file.name = paste("Scatterplot_",
#as.character(breast_microarrays[, "number"] [i],
#"_", as.character(breast_microarrays[, "number"][i+1]), ".pdf",
#sep =" "))
#dev.copy2pdf(file = file.name)
#}
#}
Descriptive Statistics
# ----------------------------------------------------------
# Boxplots containing all TRA genes present in breast cancer
# ----------------------------------------------------------
# Five boxplots are created, each counting 50 genes, to ensure better visibility and accessibility of each gene.
par(las = 2,
cex.axis = 0.5, cex.main = 0.8,
mai = c(0.65,0.4,0.4,0.1))
for (i in seq(1,250,by = 50)){
generange = paste0("(", rownames(breast_GeneExprs_sub_combined)[i],
" - ",
rownames(breast_GeneExprs_sub_combined)[i+49],")")
boxplot(t(breast_GeneExprs_sub_combined[i:(i+49),filter(breast_microarrays)$number]),
col = rainbow(50),
main= paste0("Boxplot of TRA gene expression in breast cancer ", generange, "\ndata set: GSE27830 (Foekens et al.)"),
horizontal = FALSE)
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))
filename = paste0("breast_geneexpression_boxplot_",
rownames(breast_GeneExprs_sub_combined)[i],"_",
rownames(breast_GeneExprs_sub_combined)[i+49],".pdf")
print(filename)
setwd(paste(projectPath, "plots", sep = "/"))
# dev.copy2pdf(file = filename)
}

## [1] "breast_geneexpression_boxplot_AAK1_CPB1.pdf"

## [1] "breast_geneexpression_boxplot_CRACR2B_GRB10.pdf"

## [1] "breast_geneexpression_boxplot_GRK4_PAFAH2.pdf"

## [1] "breast_geneexpression_boxplot_PAK3_SEL1L.pdf"

## [1] "breast_geneexpression_boxplot_SERPINA3_ZNF780A.pdf"
# ----------------------------------------------------------
# Heatmaps
# ----------------------------------------------------------
filenames <- rownames(pData(data_breast))
breast_samples <- substr(filenames, 1, 9)
# Heatmap of the gene expression in breast cancer.
pheatmap(t(breast_GeneExprs_sub),
labels_row = paste(breast_samples),
labels_col = rep("",nrow(breast_GeneExprs_sub)),
main = "Gene expression in breast cancer \ndata set: GSE27830 (Foekens et al.)",
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize_row = 8, fontsize = 10)

# Gene expression of breast cancer after combining.
pheatmap(t(breast_GeneExprs_sub_combined),
labels_row = paste(breast_samples),
labels_col = rep("",nrow(breast_GeneExprs_sub_combined)),
main = "Gene expression in breast cancer after combination \ndata set: GSE27830 (Foekens et al.)",
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize_row = 8, fontsize = 10)

Analysis
## Mean of entire set
mean(breast_GeneExprs_sub_combined)
## [1] 7.241704
## 7.241704
## Median of entire set
median(breast_GeneExprs_sub_combined)
## [1] 6.766971
## 6.766971
## Standard deviation of entire set
sd(breast_GeneExprs_sub_combined)
## [1] 1.41852
## 1.41852
# ----------------------------------------------------------
# High variance
# ----------------------------------------------------------
## Calculate variance of rows (genes)
var_breast <- apply(breast_GeneExprs_sub_combined, 1, var, na.rm=TRUE)
## Create function to filter out the top 25% of variance.
top25 <- function(x) {
x >= quantile(x, 0.75)
}
## Apply function to variance set, to determine the top 25% of variance.
top25_breast <- top25(var_breast)
top25_breast_names <- which(top25_breast == "TRUE")
high_var_breast_sub <- breast_GeneExprs_sub_combined[top25_breast_names, , drop = FALSE]
dim(high_var_breast_sub)
## [1] 63 10
## data set complete: 250 genes
## data set high variance: 63 genes
# Boxplot of 25% high variance genes
par(las = 2)
par(mai = c(0.7,0.4,0.5,0.1))
boxplot(t(high_var_breast_sub),
col = rainbow(63),
cex.axis = 0.5,
main = "Top 25% high variance genes \ndata set: GSE27830 (Foekens et al.)");
abline(h = median(breast_GeneExprs_sub_combined), col = "red")

# ----------------------------------------------------------
# Genes with high expression levels
# ----------------------------------------------------------
# Median
breast_median <- apply(breast_GeneExprs_sub_combined, 1, median, na.rm=TRUE)
median(breast_median)
## [1] 6.754704
## Median: 6.754704
## Search for genes, whose median gene expression are more than 1.5 times that of the median.
which(breast_median > (1.5 * median(breast_median)))
## CYB5A GNAS NUPR1 RPL5 RPL8 RPS9 SERPINA3 SPP1
## 58 93 143 192 193 194 201 222
## SSR4 XBP1
## 225 246
## Names of genes that fit this criteria: 10 in total.
## CYB5A GNAS NUPR1 RPL5 RPL8 RPS9 SERPINA3 SPP1 SSR4 XBP1
## Expression values of those genes:
breast_median_Gene_Exprs <- breast_GeneExprs[which(breast_median > (1.5 * median(breast_median)))]
breast_median_Gene_Exprs
## [1] 6.690879 8.422208 7.805209 6.782207 5.740733 5.712064 7.076842 6.705710
## [9] 6.597868 5.970241
# Pancreas TRA with an median expression above the overall median:
medianbreast <- median(breast_GeneExprs_sub_combined)
length(which(apply(breast_GeneExprs_sub_combined,1,median) > medianbreast))
## [1] 124
# 124 genes have a higher expression than the overall median.
# Creating matrix with overexpressed TRAs in decreasing order:
overexpressedbreast <- breast_GeneExprs_sub_combined[which(apply(breast_GeneExprs_sub_combined,1,median)> medianbreast),]
overexpressedbreastordered <- overexpressedbreast[order(apply(overexpressedbreast,1,median),decreasing=TRUE),]
# Boxplot with all the genes that are higher expressed than overall median.
par(las =2, mai = c(0.7,0.6,0.6,0.2))
boxplot(t(overexpressedbreast),
col = rainbow(124),
cex.axis = 0.5,
main="Breast Cancer gene expression of overexpressed pancreas TRAs \ndata set: GSE27830 (Foekens et al.)")
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))

# Boxplot with the top 25 genes that are higher expressed than the median.
par(las =2, mai = c(1.2,0.6,0.6,0.2))
boxplot(t(overexpressedbreastordered[1:25,]),
col = rainbow(25),
cex.axis = 1,
main="Gene expression of the 25 highest expressed pancreas TRAs \ndata set: GSE27830 (Foekens et al.)")
abline(h = median(apply(breast_vsnrma_matrix, 1, median)))
